This assignment aims to solve the problems of Mini Challenge 2.
The global settings of the R code chunks in this post are set as follows.
The following code installs and loads the required R packages.
# Packages used in this post: geospatial handling (raster, sf, tmap),
# date-time parsing (clock), tables (DT), interactive graphics (ggiraph,
# plotly), data wrangling (tidyverse, dplyr, readr) and plot themes
# (hrbrthemes). Install any package that is missing, then attach it.
packages <- c('raster', 'sf', 'tmap', 'clock', 'DT', 'ggiraph', 'plotly',
              'tidyverse', 'dplyr', 'readr', 'hrbrthemes')
for (p in packages) {
  # Use TRUE rather than the abbreviation T (T can be reassigned).
  if (!require(p, character.only = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
The following code imports the raw data sets from Mini Challenge 2 (“cc_data.csv”, “loyalty_data.csv”, “car_assignments.csv”, “gps.csv”).
# Import the four raw data sets supplied with Mini Challenge 2.
credit_debit <- read_csv("data/cc_data.csv")
loyalty_data <- read_csv("data/loyalty_data.csv")
car_assignment <- read_csv("data/car_assignments.csv")
GPS <- read_csv("data/gps.csv")
# Inspect the structure of the credit/debit card data (output shown below).
glimpse(credit_debit)
Rows: 1,490
Columns: 4
$ timestamp <chr> "1/6/2014 7:28", "1/6/2014 7:34", "1/6/2014 7:35"~
$ location <chr> "Brew've Been Served", "Hallowed Grounds", "Brew'~
$ price <dbl> 11.34, 52.22, 8.33, 16.72, 4.24, 4.17, 28.73, 9.6~
$ last4ccnum <dbl> 4795, 7108, 6816, 9617, 7384, 5368, 7253, 4948, 9~
# Inspect the structure of the loyalty card data (output shown below).
glimpse(loyalty_data)
Rows: 1,392
Columns: 4
$ timestamp <chr> "1/8/2014", "1/8/2014", "1/14/2014", "1/9/2014", ~
$ location <chr> "Carlyle Chemical Inc.", "Carlyle Chemical Inc.",~
$ price <dbl> 4983.52, 4901.88, 4898.39, 4792.50, 4788.22, 4742~
$ loyaltynum <chr> "L8477", "L5756", "L2769", "L3317", "L8477", "L57~
# Inspect the structure of the raw GPS data (output shown below).
glimpse(GPS)
Rows: 685,169
Columns: 4
$ Timestamp <chr> "1/6/2014 6:28", "1/6/2014 6:28", "1/6/2014 6:28",~
$ id <dbl> 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35~
$ lat <dbl> 36.07623, 36.07622, 36.07621, 36.07622, 36.07621, ~
$ long <dbl> 24.87469, 24.87460, 24.87444, 24.87425, 24.87417, ~
# Preview the first rows of the loyalty card data (output shown below).
head(loyalty_data)
# A tibble: 6 x 4
timestamp location price loyaltynum
<chr> <chr> <dbl> <chr>
1 1/8/2014 Carlyle Chemical Inc. 4984. L8477
2 1/8/2014 Carlyle Chemical Inc. 4902. L5756
3 1/14/2014 Abila Airport 4898. L2769
4 1/9/2014 Abila Airport 4792. L3317
5 1/15/2014 Maximum Iron and Steel 4788. L8477
6 1/16/2014 Nationwide Refinery 4743. L5756
# Preview the first rows of the credit/debit card data (output shown below).
head(credit_debit)
# A tibble: 6 x 4
timestamp location price last4ccnum
<chr> <chr> <dbl> <dbl>
1 1/6/2014 7:28 Brew've Been Served 11.3 4795
2 1/6/2014 7:34 Hallowed Grounds 52.2 7108
3 1/6/2014 7:35 Brew've Been Served 8.33 6816
4 1/6/2014 7:36 Hallowed Grounds 16.7 9617
5 1/6/2014 7:37 Brew've Been Served 4.24 7384
6 1/6/2014 7:38 Brew've Been Served 4.17 5368
Using just the credit and loyalty card data, identify the most popular locations, and when they are popular. What anomalies do you see? What corrections would you recommend to correct these anomalies?
Comparison of total amount between credit/debit card and loyalty card
After glimpsing the data structures of the credit and loyalty card data, a heat map is a good way to visualize the most popular locations and the times when they are popular. To create this graph, the loyalty card data first needs to be aggregated.
# Add a constant event counter to each transaction so that summing it per
# group yields the number of transactions in that group (equivalent to
# dplyr::n(), kept as an explicit column for clarity).
# Use <- for assignment rather than = (tidyverse style).
loyalty_data$count_event <- 1
credit_debit$count_event <- 1
# Confirm the new column was added (output shown below).
head(loyalty_data)
# A tibble: 6 x 5
timestamp location price loyaltynum count_event
<chr> <chr> <dbl> <chr> <dbl>
1 1/8/2014 Carlyle Chemical Inc. 4984. L8477 1
2 1/8/2014 Carlyle Chemical Inc. 4902. L5756 1
3 1/14/2014 Abila Airport 4898. L2769 1
4 1/9/2014 Abila Airport 4792. L3317 1
5 1/15/2014 Maximum Iron and Steel 4788. L8477 1
6 1/16/2014 Nationwide Refinery 4743. L5756 1
# Aggregate loyalty card transactions by day and location: Frequency is the
# number of transactions and Money_loyalty the total spend per location per
# day. dplyr::summarize() drops only the last grouping level, so the result
# remains grouped by timestamp (visible in the printout below).
aggregate_dataset <- loyalty_data %>%
group_by(timestamp,location) %>%
dplyr::summarize(Frequency = sum(count_event),Money_loyalty=sum(price))
# Preview the aggregated loyalty data (output shown below).
head(aggregate_dataset)
# A tibble: 6 x 4
# Groups: timestamp [1]
timestamp location Frequency Money_loyalty
<chr> <chr> <dbl> <dbl>
1 1/10/2014 Abila Zacharo 7 171.
2 1/10/2014 Albert's Fine Clothing 1 126.
3 1/10/2014 Bean There Done That 5 60.7
4 1/10/2014 Brew've Been Served 14 132.
5 1/10/2014 Brewed Awakenings 3 33.9
6 1/10/2014 Carlyle Chemical Inc. 2 3717.
# Parse the credit card timestamp as a Date. as.Date() stops matching after
# "%m/%d/%Y", so the trailing time-of-day (e.g. "7:28") is silently
# discarded — acceptable here because the analysis works at daily
# granularity, but note the information loss.
credit_debit$timestamp <- as.Date(credit_debit$timestamp, "%m/%d/%Y")
# Aggregate credit/debit transactions by day and location, mirroring the
# loyalty card aggregation: Frequency = transaction count, Money_cd = total
# daily spend per location.
aggregate_cc <- credit_debit %>%
group_by(timestamp,location) %>%
dplyr::summarize(Frequency = sum(count_event),Money_cd=sum(price))
head(aggregate_cc)
# A tibble: 6 x 4
# Groups: timestamp [1]
timestamp location Frequency Money_cd
<date> <chr> <dbl> <dbl>
1 2014-01-06 Abila Airport 4 7803.
2 2014-01-06 Abila Zacharo 6 380.
3 2014-01-06 Albert's Fine Clothing 2 399.
4 2014-01-06 Bean There Done That 5 44.4
5 2014-01-06 Brew've Been Served 16 185.
6 2014-01-06 Brewed Awakenings 3 27.0
# Re-check the loyalty aggregation; the printout below shows a `Day` column
# that was derived from the timestamp in a chunk not shown here.
head(aggregate_dataset)
# A tibble: 6 x 5
# Groups: timestamp [1]
timestamp location Frequency Money_loyalty Day
<date> <chr> <dbl> <dbl> <chr>
1 2014-01-10 Abila Zacharo 7 171. 10
2 2014-01-10 Albert's Fine Clothing 1 126. 10
3 2014-01-10 Bean There Done That 5 60.7 10
4 2014-01-10 Brew've Been Served 14 132. 10
5 2014-01-10 Brewed Awakenings 3 33.9 10
6 2014-01-10 Carlyle Chemical Inc. 2 3717. 10
# Re-check the credit/debit aggregation; the `Day` column shown below was
# derived from the timestamp in a chunk not shown here.
head(aggregate_cc)
# A tibble: 6 x 5
# Groups: timestamp [1]
timestamp location Frequency Money_cd Day
<date> <chr> <dbl> <dbl> <chr>
1 2014-01-06 Abila Airport 4 7803. 06
2 2014-01-06 Abila Zacharo 6 380. 06
3 2014-01-06 Albert's Fine Clothing 2 399. 06
4 2014-01-06 Bean There Done That 5 44.4 06
5 2014-01-06 Brew've Been Served 16 185. 06
6 2014-01-06 Brewed Awakenings 3 27.0 06
# Heat map of loyalty card usage: day of month on the x-axis, location on
# the y-axis, and daily transaction frequency mapped to the fill colour.
# NOTE(review): the `Day` and `text` columns are built in a chunk not shown
# here — `text` holds the tooltip string consumed by ggplotly() below.
p <- ggplot(data = aggregate_dataset, aes(x=Day, y=location,fill=Frequency,text=text)) +
geom_tile() +
# Canonical R colour names contain no spaces ("lightyellow", "darkblue");
# the spaced forms are not in grDevices::colors().
scale_fill_gradient(low="lightyellow", high="darkblue") +
theme_ipsum()
# Shrink the y-axis labels so all location names remain readable.
p <- p + theme(axis.text.y = element_text(size = 8))
# Render interactively, showing only the custom tooltip text.
ggplotly(p, tooltip="text")
# Companion heat map for credit/debit card usage, with a red high end so the
# two heat maps are visually distinct. `text2` is the tooltip string built
# in a chunk not shown here.
z <- ggplot(data = aggregate_cc, aes(x=Day, y=location,fill=Frequency,text=text2)) +
geom_tile() +
# Canonical R colour name: "lightyellow" (the spaced form is not in
# grDevices::colors()).
scale_fill_gradient(low="lightyellow", high="red") +
theme_ipsum()
z <- z + theme(axis.text.y = element_text(size = 8))
ggplotly(z, tooltip="text2")
Based on the heat maps, which show the patterns of loyalty card and credit/debit card usage, we can infer that the most popular places from Jan. 6 to Jan. 19 are Brew've Been Served and Katerina's Cafe, since the heat map colors are darkest for these two places. However, from the tooltips we can also see some differences between the frequencies of the two types of card usage, which are abnormal. So the next step is to build a new data frame to show the differences in recorded amounts and frequencies between the two card types more clearly.
# Summarise loyalty spending per (Day, location). aggregate_dataset already
# has one row per day/location, so max() and sum() simply carry the single
# value over into columns that can be compared with the credit card summary
# built below. `Day` was derived from timestamp in a chunk not shown here.
loyalty_money <- aggregate_dataset %>% group_by(Day,location) %>% dplyr::summarise(max_loyal=max(Money_loyalty),freq_loyal=sum(Frequency))
# Note: this prints the input, not loyalty_money (shown two chunks below).
head(aggregate_dataset)
# A tibble: 6 x 6
# Groups: timestamp [1]
timestamp location Frequency Money_loyalty Day text
<date> <chr> <dbl> <dbl> <chr> <chr>
1 2014-01-10 Abila Zach~ 7 171. 10 "Location: Abi~
2 2014-01-10 Albert's F~ 1 126. 10 "Location: Alb~
3 2014-01-10 Bean There~ 5 60.7 10 "Location: Bea~
4 2014-01-10 Brew've Be~ 14 132. 10 "Location: Bre~
5 2014-01-10 Brewed Awa~ 3 33.9 10 "Location: Bre~
6 2014-01-10 Carlyle Ch~ 2 3717. 10 "Location: Car~
# Preview the per-day/location loyalty summary (output shown below).
head(loyalty_money)
# A tibble: 6 x 4
# Groups: Day [1]
Day location max_loyal freq_loyal
<chr> <chr> <dbl> <dbl>
1 06 Abila Airport 5930. 3
2 06 Abila Zacharo 160. 6
3 06 Albert's Fine Clothing 399. 2
4 06 Bean There Done That 44.4 5
5 06 Brew've Been Served 176. 17
6 06 Brewed Awakenings 27.0 3
# loyalty_money is still grouped by Day (summarise dropped only the location
# level), so count() tallies the number of locations with loyalty
# transactions on each day.
dplyr::count(loyalty_money)
# A tibble: 14 x 2
# Groups: Day [14]
Day n
<chr> <int>
1 06 24
2 07 23
3 08 25
4 09 26
5 10 22
6 11 12
7 12 14
8 13 23
9 14 23
10 15 25
11 16 25
12 17 20
13 18 16
14 19 12
# Mirror of loyalty_money for the credit/debit card data: per (Day, location)
# spend (max_cc) and transaction frequency (freq_cc), for side-by-side
# comparison with the loyalty card summary.
cc_money <- aggregate_cc %>% group_by(Day,location) %>% dplyr::summarise(max_cc = max(Money_cd),freq_cc=sum(Frequency))
head(cc_money)
# A tibble: 6 x 4
# Groups: Day [1]
Day location max_cc freq_cc
<chr> <chr> <dbl> <dbl>
1 06 Abila Airport 7803. 4
2 06 Abila Zacharo 380. 6
3 06 Albert's Fine Clothing 399. 2
4 06 Bean There Done That 44.4 5
5 06 Brew've Been Served 185. 16
6 06 Brewed Awakenings 27.0 3
# Number of locations with credit/debit transactions per day; comparing these
# counts against the loyalty card counts above exposes days where the two
# data sources disagree.
dplyr::count(cc_money)
# A tibble: 14 x 2
# Groups: Day [14]
Day n
<chr> <int>
1 06 24
2 07 23
3 08 25
4 09 25
5 10 25
6 11 13
7 12 14
8 13 25
9 14 25
10 15 23
11 16 25
12 17 21
13 18 15
14 19 13
From the new data frames shown, we can see that some records can be matched — their frequencies are equal and the amounts agree — but some cannot. The most abnormal situation is that, on some days, the amount recorded on the loyalty card is lower than the amount on the credit/debit card, yet the loyalty card frequency for that day is higher than the credit card frequency at the same place, which needs to be noted.
Another thought for the Question 1 visualization: compared with the heat map, are there better ways to visualize this? For example, a line chart over the time period could be designed to show the differences in amounts.
Add the vehicle data to your analysis of the credit and loyalty card data. How does your assessment of the anomalies in question 1 change based on this new data? What discrepancies between vehicle, credit, and loyalty card data do you find? Please limit your answer to 8 images and 500 words.
First, “MC2-tourist.jpg” is imported for data preparation.
# Load the tourist map image as a raster layer to serve as the plot
# background for the GPS paths.
bgmap <- raster("Data/MC2-tourist.jpg")
# Print the raster metadata (output shown below).
bgmap
class : RasterLayer
band : 1 (of 3 bands)
dimensions : 1535, 2740, 4205900 (nrow, ncol, ncell)
resolution : 1, 1 (x, y)
extent : 0, 2740, 0, 1535 (xmin, xmax, ymin, ymax)
crs : NA
source : MC2-tourist.jpg
names : MC2.tourist
values : 0, 255 (min, max)
# Plot the three-band tourist map as an RGB raster image.
# NOTE(review): bgmap is passed to tm_rgb() as an extra positional argument
# although tm_shape(bgmap) already supplies the raster — verify against the
# tmap version in use; recent tmap releases expect only band/appearance
# options here.
tm_shape(bgmap) +
tm_rgb(bgmap, r = 1,g = 2,b = 3,
alpha = NA,
saturation = 1,
interpolate = TRUE,
max.value = 255)
# Read the Abila street network (ESRI shapefile) as sf LINESTRING features;
# the layer metadata is printed below.
Abila_st <- st_read(dsn = "Data/Geospatial",
layer = "Abila")
Reading layer `Abila' from data source
`C:\linanyaogaibian\Dataviz_blog\_posts\2021-07-13-assignment\Data\Geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 3290 features and 9 fields
Geometry type: LINESTRING
Dimension: XY
Bounding box: xmin: 24.82401 ymin: 36.04502 xmax: 24.90997 ymax: 36.09492
Geodetic CRS: WGS 84
# Re-inspect the GPS data before parsing its timestamp column.
glimpse(GPS)
Rows: 685,169
Columns: 4
$ Timestamp <chr> "1/6/2014 6:28", "1/6/2014 6:28", "1/6/2014 6:28",~
$ id <dbl> 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35~
$ lat <dbl> 36.07623, 36.07622, 36.07621, 36.07622, 36.07621, ~
$ long <dbl> 24.87469, 24.87460, 24.87444, 24.87425, 24.87417, ~
# Parse GPS timestamps with clock. zone = "" uses the system time zone —
# NOTE(review): consider an explicit zone (e.g. "UTC") for reproducibility.
GPS$Timestamp <- date_time_parse(GPS$Timestamp,
zone = "",
format = "%m/%d/%Y %H:%M")
# Day of month as a factor, used later to split one GPS path per car per day.
GPS$day <- as.factor(get_day(GPS$Timestamp))
# Car id is categorical, not numeric.
GPS$id <- as_factor(GPS$id)
glimpse(GPS)
Rows: 685,169
Columns: 5
$ Timestamp <dttm> 2014-01-06 06:28:00, 2014-01-06 06:28:00, 2014-01~
$ id <fct> 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35~
$ lat <dbl> 36.07623, 36.07622, 36.07621, 36.07622, 36.07621, ~
$ long <dbl> 24.87469, 24.87460, 24.87444, 24.87425, 24.87417, ~
$ day <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,~
# Convert the GPS tibble to an sf point layer; coordinates are (long, lat)
# and CRS 4326 declares them as WGS 84 geodetic coordinates.
GPS_sf <- st_as_sf(GPS,
coords = c("long", "lat"),
crs= 4326)
GPS_sf
Simple feature collection with 685169 features and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 24.82509 ymin: 36.04802 xmax: 24.90849 ymax: 36.08996
Geodetic CRS: WGS 84
# A tibble: 685,169 x 4
Timestamp id day geometry
* <dttm> <fct> <fct> <POINT [°]>
1 2014-01-06 06:28:00 35 6 (24.87469 36.07623)
2 2014-01-06 06:28:00 35 6 (24.8746 36.07622)
3 2014-01-06 06:28:00 35 6 (24.87444 36.07621)
4 2014-01-06 06:28:00 35 6 (24.87425 36.07622)
5 2014-01-06 06:28:00 35 6 (24.87417 36.07621)
6 2014-01-06 06:28:00 35 6 (24.87406 36.07619)
7 2014-01-06 06:28:00 35 6 (24.87391 36.07619)
8 2014-01-06 06:28:00 35 6 (24.87381 36.07618)
9 2014-01-06 06:28:00 35 6 (24.87374 36.07617)
10 2014-01-06 06:28:00 35 6 (24.87362 36.07618)
# ... with 685,159 more rows
# Build one LINESTRING path per car (id) per day from the GPS points.
# do_union = FALSE makes sf's grouped summarize keep the original point
# order when combining geometries, so st_cast() yields a path that follows
# the actual drive sequence. m (the mean timestamp) gives summarize() a
# column to return and roughly locates each path in time.
gps_path <- GPS_sf %>%
group_by(id, day) %>%
summarize(m = mean(Timestamp),
do_union=FALSE) %>%
st_cast("LINESTRING")
gps_path
Simple feature collection with 508 features and 3 fields
Geometry type: LINESTRING
Dimension: XY
Bounding box: xmin: 24.82509 ymin: 36.04802 xmax: 24.90849 ymax: 36.08996
Geodetic CRS: WGS 84
# A tibble: 508 x 4
# Groups: id [40]
id day m geometry
<fct> <fct> <dttm> <LINESTRING [°]>
1 1 6 2014-01-06 15:01:40 (24.88258 36.06646, 24.88259 36.06~
2 1 7 2014-01-07 12:40:38 (24.87957 36.04803, 24.87957 36.04~
3 1 8 2014-01-08 14:34:56 (24.88265 36.06643, 24.88266 36.06~
4 1 9 2014-01-09 12:04:17 (24.88261 36.06646, 24.88257 36.06~
5 1 10 2014-01-10 16:04:29 (24.88265 36.0665, 24.88261 36.066~
6 1 11 2014-01-11 16:18:03 (24.88258 36.06651, 24.88246 36.06~
7 1 12 2014-01-12 13:30:36 (24.88259 36.06643, 24.8824 36.066~
8 1 13 2014-01-13 13:45:46 (24.88265 36.06642, 24.8826 36.066~
9 1 14 2014-01-14 14:03:54 (24.88261 36.06644, 24.88262 36.06~
10 1 15 2014-01-15 15:33:25 (24.88263 36.06647, 24.88257 36.06~
# ... with 498 more rows
# Keep only the daily paths of car id 1 for plotting.
gps_path_selected <- gps_path %>%
filter(id==1)
# Switch tmap to interactive viewing mode.
tmap_mode("view")
# Overlay car 1's daily movement paths on the tourist-map raster background.
# NOTE(review): as above, bgmap is passed to tm_rgb() positionally — confirm
# this matches the installed tmap version's signature.
tm_shape(bgmap) +
tm_rgb(bgmap, r = 1,g = 2,b = 3,
alpha = NA,
saturation = 1,
interpolate = TRUE,
max.value = 255) +
tm_shape(gps_path_selected) +
tm_lines()